#Importing the different libraries to analyze the data
import plotly.offline as pyo
import plotly.graph_objs as go
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from scipy.stats import norm
import statistics
%matplotlib inline
pyo.init_notebook_mode()
# Load the Iris measurements from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='Iris.csv')
# A bare expression on the last line of a cell makes the notebook render the frame.
df
| | SepalLengthCm | SepalWidthCm | PetalLengthCm | PetalWidthCm | Species |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | Iris-setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | Iris-setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | Iris-setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | Iris-setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | Iris-setosa |
| ... | ... | ... | ... | ... | ... |
| 145 | 6.7 | 3.0 | 5.2 | 2.3 | Iris-virginica |
| 146 | 6.3 | 2.5 | 5.0 | 1.9 | Iris-virginica |
| 147 | 6.5 | 3.0 | 5.2 | 2.0 | Iris-virginica |
| 148 | 6.2 | 3.4 | 5.4 | 2.3 | Iris-virginica |
| 149 | 5.9 | 3.0 | 5.1 | 1.8 | Iris-virginica |
150 rows × 5 columns
# Drop the Id column since there is no use for it and it does not contribute to our analysis.
# errors='ignore' keeps this cell safe to re-run: without it, a second run raises KeyError
# because the column has already been removed from df.
df = df.drop(columns=['Id'], errors='ignore')
# Get an understanding of the data: count, mean, std, min, quartiles (the 50% row is the
# median) and max of every numeric column, computed over all species combined.
# The bare `df.groupby(['Species'])` expression was dropped: its result was never used or
# displayed, so it had no effect. For per-species statistics, call
# df.groupby('Species').describe() in its own cell.
df.describe()
| | SepalLengthCm | SepalWidthCm | PetalLengthCm | PetalWidthCm |
|---|---|---|---|---|
| count | 150.000000 | 150.000000 | 150.000000 | 150.000000 |
| mean | 5.843333 | 3.054000 | 3.758667 | 1.198667 |
| std | 0.828066 | 0.433594 | 1.764420 | 0.763161 |
| min | 4.300000 | 2.000000 | 1.000000 | 0.100000 |
| 25% | 5.100000 | 2.800000 | 1.600000 | 0.300000 |
| 50% | 5.800000 | 3.000000 | 4.350000 | 1.300000 |
| 75% | 6.400000 | 3.300000 | 5.100000 | 1.800000 |
| max | 7.900000 | 4.400000 | 6.900000 | 2.500000 |
# Median of each numeric column. numeric_only=True skips the text `Species` column
# explicitly; relying on pandas to drop it silently is deprecated (see the FutureWarning)
# and raises TypeError in newer pandas versions.
df.median(numeric_only=True)
/tmp/ipykernel_124/530051474.py:1: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError. Select only valid columns before calling the reduction.
SepalLengthCm 5.80 SepalWidthCm 3.00 PetalLengthCm 4.35 PetalWidthCm 1.30 dtype: float64
# For this exploratory data analysis, I will be doing open-ended analysis without a specific end goal (i.e. without a set question to answer).
# The point of EDA is usually to discover patterns within the dataset, but since the Iris dataset has been around
# for a while, I thought that this would be a great opportunity to simply showcase my EDA skills.
# Count how many observations each species has.
plt.title('Species Count')
# The column is passed by keyword: from seaborn 0.12 the only valid positional argument
# is `data`, so a positional Series would be misinterpreted or rejected.
sns.countplot(x='Species', data=df)
/opt/conda/lib/python3.8/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
<AxesSubplot:title={'center':'Species Count'}, xlabel='Species', ylabel='count'>
# KDE plot of the dataset. A KDE is a smoothed estimate of how the observations of each variable are distributed.
# Sepal Length and Sepal Width look roughly bell-shaped (approximately normal); Petal Length and Petal Width
# do not look normally distributed. This is a visual impression only, not a formal normality test.
sns.kdeplot(data=df)
<AxesSubplot:ylabel='Density'>
# Scatter plot of SepalWidth (y axis) against SepalLength (x axis), one colour per Iris species.
fig = px.scatter(
    df,
    x='SepalLengthCm',
    y='SepalWidthCm',
    color="Species",
)
fig.show()
# Fit an ordinary-least-squares trendline per species to see how strongly SepalWidth
# depends linearly on SepalLength. R^2 of each per-species fit:
# Iris-Setosa: 0.557681; about 55.8% of the variance in sepal width is explained by sepal length.
# Iris-Versicolor: 0.26582; about 26.6% of the variance in sepal width is explained by sepal length.
# Iris-Virginica: 0.209057; about 20.9% of the variance in sepal width is explained by sepal length.
# (R^2 is the share of variance explained by the fit, not the share of data points that "fit" the model.)
fig = px.scatter(
    df,
    x='SepalLengthCm',
    y='SepalWidthCm',
    color="Species",
    trendline='ols',
    opacity=0.65,
)
fig.show()
/opt/conda/lib/python3.8/site-packages/statsmodels/compat/pandas.py:65: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
# Scatter plot of PetalWidth (y axis) against PetalLength (x axis), one colour per Iris species.
# Length goes on the x axis and width on the y axis so that this plot matches the sepal
# scatter plot and the petal regression plot, which both use that orientation.
fig = px.scatter(df, x="PetalLengthCm", y="PetalWidthCm", color="Species")
fig.show()
# Fit an ordinary-least-squares trendline per species to see how strongly PetalWidth
# depends linearly on PetalLength. R^2 of each per-species fit:
# Iris-Setosa: 0.093825; about 9.4% of the variance in petal width is explained by petal length.
# Iris-Versicolor: 0.618847; about 61.9% of the variance in petal width is explained by petal length.
# Iris-Virginica: 0.103754; about 10.4% of the variance in petal width is explained by petal length.
# (R^2 is the share of variance explained by the fit, not the share of data points that "fit" the model.)
fig = px.scatter(
    df,
    x='PetalLengthCm',
    y='PetalWidthCm',
    color="Species",
    trendline='ols',
    opacity=0.65,
)
fig.show()
# Determining correlation
# Draw a heatmap of the pairwise correlations between the numeric variables to see which
# relationships are strong and which are weak.
# Taking a quick look at the correlations, there is a strong positive correlation between:
# Petal Length and Petal Width; Sepal Length and Petal Width; Sepal Length and Petal Length
# There is a negative correlation between:
# Sepal Width and Petal Length;
# Sepal Width and Petal Width
# NOTE(review): on the standard Iris data these negative coefficients are only moderate
# (roughly -0.4), not strong; read the exact values from the annotated cells.
# Only numeric columns are correlated: `Species` is text, and newer pandas versions raise
# instead of silently dropping it. select_dtypes works on old and new pandas alike.
q = df.select_dtypes(include='number').corr()
sns.heatmap(q, annot = True)
<AxesSubplot:>
# Grid of pairwise plots of the variables, coloured by species, with a KDE curve on each diagonal panel.
sns.pairplot(
    df,
    hue="Species",
    diag_kind="kde",
    palette="viridis",
    height=3.5,
)
<seaborn.axisgrid.PairGrid at 0x7fc0d13985e0>
# Horizontal boxplot for each variable, with all species combined. The green triangle marks the mean.
# Sepal Length: the mean is slightly above the median, so the distribution is slightly positively skewed.
# Sepal Width: the mean is slightly above the median, so the distribution is slightly positively skewed.
# Petal Length: the mean is below the median, so the distribution is negatively skewed.
# Petal Width: the mean is slightly below the median, so the distribution is slightly negatively skewed.
sns.catplot(
    data=df,
    kind="box",
    orient="h",
    showmeans=True,
)
<seaborn.axisgrid.FacetGrid at 0x7fc0d0886af0>
# We can further dissect this by looking at the skewness of each numeric column.
# Sepal Length is positively skewed.
# Sepal Width is positively skewed.
# Petal Length is negatively skewed.
# Petal Width is (slightly) negatively skewed.
# numeric_only=True skips the text `Species` column explicitly; relying on pandas to drop
# it silently is deprecated and raises TypeError in newer pandas versions.
df.skew(numeric_only=True)
/tmp/ipykernel_124/1665899112.py:1: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError. Select only valid columns before calling the reduction.
SepalLengthCm 0.314911 SepalWidthCm 0.334053 PetalLengthCm -0.274464 PetalWidthCm -0.104997 dtype: float64
# We can further dissect this by looking at the kurtosis of each numeric column.
# pandas reports Fisher (excess) kurtosis, so a normal distribution scores 0 and negative
# values indicate a flatter, lighter-tailed shape than the normal.
# numeric_only=True skips the text `Species` column explicitly; relying on pandas to drop
# it silently is deprecated and raises TypeError in newer pandas versions.
df.kurt(numeric_only=True)
/tmp/ipykernel_124/1257127604.py:1: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError. Select only valid columns before calling the reduction.
SepalLengthCm -0.552064 SepalWidthCm 0.290781 PetalLengthCm -1.401921 PetalWidthCm -1.339754 dtype: float64
# A violin plot visualizes the distribution of numerical data. One figure is drawn per
# measurement, split by species, with an embedded box plot and every observation shown
# as a point; hovering a point lists all columns of that row.
for measurement in ("SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"):
    fig = px.violin(
        df,
        x="Species",
        y=measurement,
        color="Species",
        box=True,
        points="all",
        hover_data=df.columns,
    )
    fig.show()
# Bubble chart of SepalLength (y axis) against PetalWidth (x axis), coloured by species,
# with the marker size also driven by PetalWidth.
fig = px.scatter(
    df,
    x="PetalWidthCm",
    y="SepalLengthCm",
    color="Species",
    size="PetalWidthCm",
)
# Horizontal legend whose bottom-right corner is anchored at x=1, y=1.02.
legend_layout = dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
fig.update_layout(legend=legend_layout)
fig.show()